Read in source data.
library(tidyverse)
library(mrcommons)
library(mrdrivers)
library(magrittr)
# Read in source data: EDGAR food system emissions and the FAO mass balance.
foodEmi <- readSource("EDGARfood", subtype = "foodSystemSector")
foodBal <- calcOutput("FAOmassbalance", aggregate = FALSE)
Process source data
# This chunk generates a dataset where the observations are year-country
# combinations. The features are greenhouse gas emissions of different food
# system sectors ("GWP...") and FAO food system metrics for the country ("k...").
# Interpolate the FAO data to annual time steps.
foodBal <- time_interpolate(foodBal, seq(1990, 2018))
# Aggregate food products to the categories kcr, kli, ksd and k.
# For a longer version of the abbreviations run:
# View(read.csv(system.file("extdata", "reportingnames.csv", package = "magpiesets")))
# Also "magpiesets/doc/magpiesets.html" in the R library folder is helpful.
sets <- read.csv(system.file("extdata", "magpiesets.csv", package = "magpiesets"))
mapping <- sets %>% select(all, kcr, kli, ksd, k)
# NOTE(review): slice(mapping, 1:44) hard-codes the number of mapping rows used;
# confirm this still matches the current magpiesets mapping file.
foodBal %>%
  toolAggregate(rel = slice(mapping, 1:44), from = "all", to = "kcr+kli+ksd+k",
                dim = 3.1, partrel = TRUE) %>%
  .[, , "", invert = TRUE] -> foodBal_p
# Subset the variables and items that will be used for the emission factors.
foodBal_p <- foodBal_p[, , c("wm", "dm", "nr")][, , c("production", "food", "milling",
                                                      "refining", "extracting", "fermentation",
                                                      "distilling", "waste", "domestic_supply",
                                                      "households")]
# Aggregate the individual processing items into one "processing" category.
map2 <- tibble(
  source = c("production", "food", "milling", "refining", "extracting",
             "fermentation", "distilling", "waste", "domestic_supply", "households"),
  tar = c("production", "processing", "processing", "processing", "processing",
          "processing", "processing", "waste", "domestic_supply", "households")
)
foodBal_p2 <- toolAggregate(foodBal_p, map2, from = "source", to = "tar", dim = 3.2)
# Pivot both objects to wide format, such that observations are country-year
# combinations.
datBal <- pivot_wider(as.data.frame(foodBal_p2), id_cols = c("Year", "Region"),
                      names_from = starts_with("Data"), values_from = "Value")
datEmi <- pivot_wider(as.data.frame(foodEmi), id_cols = c("Year", "Region"),
                      names_from = starts_with("Data"), values_from = "Value")
# Join emission data with FAO data. Explicit keys: both frames share exactly
# Year and Region, so this matches the previous natural join.
data <- left_join(datEmi, datBal, by = c("Year", "Region"))
Add possible drivers/predictors to the data.
########## add drivers to the data
### Add extra information from foodBal/FAO as drivers.
# Add finer resolved processing and production items (dry matter based).
# NOTE(review): "fermentation" is not included here although it is part of the
# processing aggregation above -- confirm this is intentional.
foodBal[, , c("food", "milling", "refining", "extracting", "distilling")][, , "dm"] %>%
  dimSums(., dim = 3.2) %>% collapseDim() %>% as.data.frame() %>%
  pivot_wider(names_from = Data1, values_from = "Value") -> driv_procc
foodBal[, , c("production")][, , "dm"] %>% collapseDim() %>% as.data.frame() %>%
  pivot_wider(names_from = Data1, values_from = "Value") -> driv_prod
# Prefix the driver columns (everything except Cell/Region/Year) so they can be
# selected by name later.
driv_procc %>% rename_with(.cols = !(1:3), .fn = ~ paste0("driv_proc_", .x)) -> driv_procc
driv_prod %>% rename_with(.cols = !(1:3), .fn = ~ paste0("driv_prod_", .x)) -> driv_prod
data <- left_join(data, driv_procc)
data <- left_join(data, driv_prod)
# Add exports (wet matter, summed over all products) as a driver.
exp <- dimSums(foodBal[, , "export"][, , "wm"], dim = 3)
data <- left_join(data, select(as.data.frame(exp), Year, Region, Value),
                  by = c("Region", "Year")) %>% rename(driv_ec_exp = Value)
### Add drivers from other data sources in the mr universe.
# Read the driver data.
gdp <- calcOutput("GDPPast", aggregate = FALSE)
pop <- calcOutput("PopulationPast", aggregate = FALSE)
affl <- collapseDim(gdp) / collapseDim(pop)  # affluence = GDP per capita
urb <- calcOutput("UrbanPast", aggregate = FALSE)
dS <- calcOutput("DevelopmentState", aggregate = FALSE)  # needs to be interpolated
dS <- time_interpolate(dS, 1990:2018)
govI <- calcOutput("GovernanceIndicator", aggregate = FALSE)  # only from 1995
govI <- time_interpolate(govI, 1990:2018)
# Join the driver columns onto the data.
data <- left_join(data, select(as.data.frame(affl), Year, Region, Value),
                  by = c("Region", "Year")) %>% rename(driv_ec_aff = Value)
data <- left_join(data, select(as.data.frame(pop), Year, Region, Value),
                  by = c("Region", "Year")) %>% rename(driv_ec_pop = Value)
data <- left_join(data, select(as.data.frame(gdp), Year, Region, Value),
                  by = c("Region", "Year")) %>% rename(driv_ec_gdp = Value)
data <- left_join(data, select(as.data.frame(urb), Year, Region, Value),
                  by = c("Region", "Year")) %>% rename(driv_ec_urb = Value)
data <- left_join(data, select(as.data.frame(govI[, , "SSP1"]), Year, Region, Value),
                  by = c("Region", "Year")) %>% rename(driv_ec_govI = Value)
data <- left_join(data, select(as.data.frame(dS[, , "SSP1"]), Year, Region, Value),
                  by = c("Region", "Year")) %>% rename(driv_ec_dS = Value)
## Add some informative data about regions and country names (for plotting).
regmap <- toolGetMapping("regionmappingH12.csv")
data <- left_join(data, regmap %>% rename(Region = CountryCode), by = "Region") %>%
  rename(info_countrynames = X, info_largReg = RegionCode)
### Adding principal components, since they capture the variance with fewer variables.
# data %>% select(starts_with("k")) %>% prcomp ->k_pca
# pcas <- as.data.frame( k_pca$x[,1:12])
# names(pcas) <- paste0("K_", names(pcas))
# data <- cbind(data, pcas)
Remove data that is not usable for the regression.
### REMOVE ROWS
# Sort out countries that are completely zero in foodBal, i.e. countries whose
# "k..." feature columns sum to exactly zero across all years.
data %>% group_by(Region) %>% summarise(across(starts_with("k"), sum)) %>% rowwise %>%
mutate(sum= sum(across(starts_with("k") )) ) %>% filter(sum==0) %>% pull(Region) -> sortout # zero-only countries
data %<>% filter(! Region %in% sortout)
#remove rows with only NAs in foodBal columns #TODO: document which year_countries are removed
# NOTE(review): the commented-out filter below is unfinished ("sum < " has no
# threshold) and would not run as written.
#data %>% mutate(sum= rowSums( across( starts_with("k"), function(x) is.na(x) ) )) %>% filter(sum < ) %>%
# select(!sum) -> data
### REMOVE COLS
# Remove emission columns that are completely zero.
data %<>% select(
!( contains("F-gases" )&!contains("Packaging")&!contains("Retail") ) # drops columns whose name contains "F-gases" but neither "Packaging" nor "Retail"
)
# Remove households_dm, which is a completely zero data column.
data %<>% select(!(contains("households_dm") ))
# Remove the Cell column, which isn't needed (presumably joined in with the
# driver data frames above -- TODO confirm).
data %<>% select(-Cell)
# Remove any other numeric columns whose absolute values sum to zero, i.e.
# data that exists but is always zero. Non-numeric columns evaluate to TRUE
# (> 0 after the == 0 below becomes FALSE) and are therefore always kept.
data %>% summarise(across(everything(), function(x) ifelse(is.numeric(x), sum(abs(x), na.rm = TRUE)>0, TRUE) )) %>% unlist() ==0 -> zerocols
which(zerocols) # print the columns that will be removed
## driv_proc_distillers_grain driv_proc_ethanol
## 89 90
## driv_proc_oilcakes driv_proc_fibres
## 94 109
## driv_proc_foddr driv_proc_begr
## 111 117
## driv_proc_betr driv_proc_pasture
## 118 119
## driv_proc_res_cereals driv_proc_res_fibrous
## 120 121
## driv_proc_res_nonfibrous driv_proc_scp
## 122 123
## driv_proc_wood driv_proc_woodfuel
## 124 125
## driv_prod_begr driv_prod_betr
## 158 159
## driv_prod_scp
## 164
data[!zerocols] ->data # drop the all-zero columns
Emission factors assume a linear relationship between the emissions from a given source and a related (economic) activity. While the linear relation does not need to be the same for all countries and times, a good starting point to choose which emission to link to which activity might still be the global correlation coefficients.
### correlation plots
# Keep only the emission columns ("GWP...") and the FAO metric columns for the
# correlation matrix; Year/Region, drivers, info columns and PCs are dropped.
data %>% select(!Year&!Region &!starts_with("driv_")&!starts_with("inf")&!starts_with("K_PC") )-> datac
# Correlate the emission columns (assumed to be the first 26) with the FAO
# metrics, then transpose so metrics are rows.
# NOTE(review): the 1:26 / 27:ncol(datac) split hard-codes the number of
# emission columns -- confirm it still holds after the column removals above.
datac %>% drop_na() %>% cor() %>%
.[1:26,27:ncol(datac)] %>% t -> tem
# Order rows by the middle part of their names (grouping related metrics) and
# plot the correlation coefficients.
tem[order(str_extract(row.names(tem), "_.*_") ) , ] %>% corrplot::corrplot( col.lim = c(-0.1,1), method = "number")
The correlation structure is less clear than one could hope for. For
the production emissions it seems common-sensical to choose a production
quantity as a denominator. “k_production_dm” seems to be a good first
idea due to the high correlations.
For the processing emissions there is no single item with high
correlations. “k_processing_dm” is chosen for N2O , “kli_processing_dm”
for CO2 and “kcr_processing_wm” for CH4.
As said the emission factors can vary for different times and regions. We are interested in finding the underlying drivers in order to predict emissions for different economic scenarios.
A simple linear regression model for the emission factors is
\[ emission_{g,c,y}/ m_{c,y} =a+b_{1}* d_{1,c,y}+b_2* d_{2,c,y} + b_3* d_{3,c,y}+ \cdots + \eta \] with \[\eta \sim N(0,\epsilon )\] g is the GHG, c is the country, y is the year, a is the intercept, \(b_i\) are the slopes, \(d_{i,c,y}\) are the drivers in the data and \(\eta\) is the normally distributed error.
Since we are interested in predicting the emissions and not the emission factor in the end it could make sense to change the model a bit by bringing the factor \(m_{c,y}\) to the RHS. The least squares estimates will minimize residual squares for the emissions and not the emission factors then.
\[ emission_{g,c,y} =a* m_{c,y}+ b_1* d_{1,c,y}* m_{c,y}+b_2* d_{2,c,y}*m_{c,y} + b_3* d_{3,c,y}*m_{c,y}+ \cdots + \gamma \] with \[\gamma \sim N(0,\delta )\]
In the following, both options — with total emissions as the dependent variable and with emission factors as the dependent variable — will be explored.
It should be mentioned that a standard assumption of this regression model is violated in our case: The observations \(emission_{g,c,y}\) \(m_{c,y}\) \(d_{i,c,y}\) for different values of c and y are not independent. Hence the results, especially the significance of influence of drivers, need to be treated with caution. A way of avoiding this assumption, would be the use of mixed effect models. However, to get a first idea of the drivers the standard model is used.
The following linear models were built by successive elimination of independent variables, mainly based on p-values. Furthermore, olsrr, a tool for automatic selection of independent variables, was used once the number of variables was small enough (i.e. below 11).
To facilitate the elimination process the following function elimHelper is used.
# elimHelper: interactive backwards elimination of regression variables.
#
# lR        <- data frame; basis for the linear model. Column "emiFac" is the
#              dependent variable; all other columns are treated as
#              independent variables.
# withRval  <- logical; additionally show the R squared of the model refit
#              without each single variable (in addition to p-values)?
# auto_elim <- integer; automatically eliminate the variable with the highest
#              p-value until only auto_elim columns are left.
# weight    <- numeric vector or NULL; if non-NULL, used as regression weights.
#
# Returns the reduced data frame. The interactive phase reads from the console:
# enter a pattern matching the column(s) to drop, confirm with "o", or enter
# "stop" to finish.
elimHelper <- function(lR, withRval, auto_elim, weight) {
  useweight <- !is.null(weight)
  # Automatic phase: eliminate the variable with the highest p-value until the
  # cap of auto_elim remaining columns is reached, then return immediately.
  if (ncol(lR) > auto_elim) {
    while (ncol(lR) > auto_elim) {
      if (useweight) {
        model <- lm(emiFac ~ ., data = lR, weights = weight)
      } else {
        model <- lm(emiFac ~ ., data = lR)
      }
      p <- summary(model)$coefficients[, 4]
      p <- p[!names(p) %in% "(Intercept)"]
      elim <- which.max(p)
      if (names(elim) %in% colnames(lR)) {
        lR %<>% select(!all_of(names(elim)))
      } else {
        # The worst term is not a plain column of lR (e.g. a factor level),
        # so automatic elimination cannot continue.
        break
      }
    }
    return(lR)
  }
  # Interactive phase: refit, report diagnostics, and ask the user which
  # variable to eliminate next. Entering "stop" ends the loop.
  flag <- TRUE
  while (flag) {
    # Set up the model and print R squared.
    if (useweight) {
      model <- lm(emiFac ~ ., data = lR, weights = weight)
    } else {
      model <- lm(emiFac ~ ., data = lR)
    }
    print(summary(model)$r.squared)
    # Print the p-values, sorted ascending.
    p <- summary(model)$coefficients[, 4]
    cat("\np-value\n")
    print(sort(p))
    # Optionally print the R squared of the model refit without each single
    # variable, to judge each variable's contribution.
    if (withRval) {
      f <- NULL
      cN <- colnames(lR)
      for (n in cN[2:length(cN)]) {
        lR2 <- lR %>% select(-all_of(n))
        # BUGFIX: fit on the reduced data lR2 (the original refit the full
        # model, so every reported "r-change" value was identical).
        if (useweight) {
          model2 <- lm(emiFac ~ ., data = lR2, weights = weight)
        } else {
          model2 <- lm(emiFac ~ ., data = lR2)
        }
        f <- c(f, setNames(summary(model2)$r.squared, n))
      }
      cat("\nr-change\n")
      print(sort(f))
    }
    # Nested loop to determine the next variable to eliminate from user input.
    flag2 <- TRUE
    while (flag2) {
      a <- readline()
      if (a == "stop") {
        flag <- FALSE
        flag2 <- FALSE
      } else {
        # Show which columns the entered pattern matches before dropping them.
        print(lR %>% select(matches(a)) %>% names())
        b <- readline()
        if (b == "o") { # confirm the selection with "o" for "ok"
          flag2 <- FALSE # the correct next variable to eliminate was confirmed
          lR %<>% select(!matches(a))
        }
      }
    }
  }
  return(lR)
}
Plot the distribution of emission factors and get information about possible outliers.
#### N2O processing emissions
# Best denominator for the emission factor: k_processing_dm
# (alternatively kli_processing_dm).
# Add the dependent variable (emission factor) to the data.
datN2OProc <- data %>%
  filter(k_processing_dm != 0) %>%
  mutate(emiFac = GWP_100_N2O_Processing / k_processing_dm)
hist(datN2OProc$emiFac, 100)
# Inspect extreme emission factors (negative or > 150) interactively.
datN2OProc %>%
  filter(emiFac < 0 | emiFac > 150) %>%
  select(emiFac, GWP_100_N2O_Processing, k_processing_dm, Region,
         info_countrynames, Year) %>%
  DT::datatable()
Build model
# Select all drivers to start the elimination process.
datN2OProc_Model <- datN2OProc %>% select(emiFac,starts_with(c("driv_ec","driv_proc","driv_prod", "ksd", "kli", "kcr") ) )
# Best predictors were found by successive elimination of variables with these commands:
# NOTE(review): "elimHelper_proc" is not defined in this file -- presumably an
# earlier name of elimHelper above; confirm.
# lR_mreduced<-elimHelper_proc(datN2OProc)
# subs <- olsrr::ols_step_all_possible(m)
# Set up a model with the best resulting predictors.
m<- lm( emiFac ~ driv_ec_exp + driv_ec_govI + driv_ec_dS, datN2OProc )
# Add the fitted values for the emission factors (and thereby the entailed
# emission predictions) to the data.
datN2OProc %>% mutate(fitted= m$fitted.values) -> datN2OProc
summary(m)
##
## Call:
## lm(formula = emiFac ~ driv_ec_exp + driv_ec_govI + driv_ec_dS,
## data = datN2OProc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.890 -1.617 -0.064 1.436 187.740
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.955173 0.261787 18.928 < 2e-16 ***
## driv_ec_exp -0.041080 0.004618 -8.895 < 2e-16 ***
## driv_ec_govI 4.587137 0.600130 7.644 2.52e-14 ***
## driv_ec_dS 2.762052 0.280347 9.852 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.858 on 4984 degrees of freedom
## Multiple R-squared: 0.07636, Adjusted R-squared: 0.07581
## F-statistic: 137.4 on 3 and 4984 DF, p-value: < 2.2e-16
# Plot predicted emissions (fitted emission factor * processing quantity)
# against true emissions.
p<- ggplot(datN2OProc ,
# predictions from the new emission factor model
aes(x= (fitted)*
k_processing_dm
, y= GWP_100_N2O_Processing
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# Weighted variant: population-weighted regression of the emission factor.
# Best predictors were found by successive elimination of variables with these commands:
weight= datN2OProc_Model$driv_ec_pop
# lR_mreduced<-elimHelper(datN2OProc_Model, T,50,weight)
#
# #m<- lm( emiFac ~ ., weights = weight , data = lR_mreduced )
#
# #summary(m)
# subs <- leaps::regsubsets(emiFac ~ ., lR_mreduced, weights=weight, nbest=3)
#
# subsSum<-summary(subs)
# compSubsets<- data.frame(
# Adj.R2 = subsSum$adjr2,
# driv = subsSum$which
# )
# #decide for a model size and look at the best model
# model<- compSubsets["X3",]
# model[which( model==TRUE)]
# Set up a model with the best resulting predictors.
m<- lm( emiFac ~ driv_ec_aff+ksd_production_nr+kli_domestic_supply_nr, weights = weight, datN2OProc )
summary(m)
##
## Call:
## lm(formula = emiFac ~ driv_ec_aff + ksd_production_nr + kli_domestic_supply_nr,
## data = datN2OProc, weights = weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -49.092 -2.923 0.708 5.346 223.228
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.141e+00 4.326e-02 165.06 <2e-16 ***
## driv_ec_aff 6.769e-05 2.479e-06 27.30 <2e-16 ***
## ksd_production_nr -2.574e+00 6.003e-02 -42.88 <2e-16 ***
## kli_domestic_supply_nr 5.690e+00 1.265e-01 44.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.7 on 4984 degrees of freedom
## Multiple R-squared: 0.3111, Adjusted R-squared: 0.3107
## F-statistic: 750.2 on 3 and 4984 DF, p-value: < 2.2e-16
# Add the fitted values for the emission factors to the data.
# NOTE(review): this overwrites the "fitted" column from the unweighted model above.
datN2OProc %>% mutate(fitted= m$fitted.values) -> datN2OProc
# Plot predicted emissions against true emissions.
p<- ggplot(datN2OProc ,
# predictions from the weighted emission factor model
aes(x= (fitted)*
k_processing_dm
, y= GWP_100_N2O_Processing
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# Plot predicted against true emissions with the old method (for comparison).
# NOTE(review): the coefficients below are hard-coded from an earlier fit --
# confirm they match the model they are meant to represent.
p<- ggplot(
data #%>% filter(Year=="2000" )
, aes(x=(8.716686 + driv_ec_pop * 0.004280 + k_processing_nr *2.2303566 + kcr_processing_nr* -3.3071144 )*
k_processing_dm
, y= GWP_100_N2O_Processing
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# Exploratory: plot N2O processing emissions (x) against the GDP driver (y).
p<- ggplot(
datN2OProc #%>% filter(Year=="2000" )
, aes(x= GWP_100_N2O_Processing
, y= driv_ec_gdp
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# Exploratory: plot the emission factor (x) against the affluence driver (y).
p<- ggplot(
datN2OProc #%>% filter(Year=="2000" )
, aes(x= emiFac
, y= driv_ec_aff
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
Plot the distribution of emission factors and get information about possible outliers.
# Region codes: Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA
#### CO2 processing emissions
#data$driv_exp
# Set up data: emission factor = CO2 processing emissions / kli processing (nr).
# NOTE(review): the filter removes rows with kli_processing_wm == 0, but the
# denominator is kli_processing_nr -- zero nr values would produce Inf/NaN
# emission factors; confirm this mismatch is intended.
data %>% filter(kli_processing_wm!=0) %>% mutate(emiFac= GWP_100_CO2_Processing/kli_processing_nr) -> datCO2Proc
hist(datCO2Proc$emiFac,100)
# View extreme emission factors (negative or > 150000) interactively.
datCO2Proc %>% filter(emiFac<0 | emiFac>150000) %>% select(emiFac, GWP_100_CO2_Processing, kli_processing_nr, Region, info_countrynames ,Year) %>% DT::datatable(.)
# Set up the data for building the model, i.e. select the possible drivers.
datCO2Proc_model <- datCO2Proc %>% select(emiFac, starts_with(c("driv_ec","driv_proc","driv_prod", "ksd", "kli", "kcr") ) )
Build a model with emission factors as dependent variable.
# Final model for the CO2 processing emission factor (predictors found by
# successive elimination; see the commented-out commands further below).
model_final <- lm( emiFac ~ driv_ec_dS + ksd_households_nr, datCO2Proc)
summary(model_final)
##
## Call:
## lm(formula = emiFac ~ driv_ec_dS + ksd_households_nr, data = datCO2Proc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38104 -12449 -2324 3908 204453
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 544.7 555.4 0.981 0.327
## driv_ec_dS 35598.3 807.5 44.084 <2e-16 ***
## ksd_households_nr 85162.7 7604.8 11.199 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22330 on 4985 degrees of freedom
## Multiple R-squared: 0.2912, Adjusted R-squared: 0.2909
## F-statistic: 1024 on 2 and 4985 DF, p-value: < 2.2e-16
# Add the fitted emission factors from model_final to the data.
datCO2Proc <- datCO2Proc %>% mutate(fitted2 = model_final$fitted.values)
# Plot predicted emissions (fitted emission factor * processing quantity)
# against true emissions. The fitted values are used directly instead of
# re-typing rounded coefficients, so the plot stays in sync with model_final.
p <- ggplot(datCO2Proc, aes(x = fitted2 * kli_processing_nr,
                            y = GWP_100_CO2_Processing,
                            label = Region, Year = Year, country = info_countrynames
                            )) +
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x, aes(group = 1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# Weighted variant: population-weighted regression of the emission factor.
# Best predictors were found by successive elimination of variables with these commands:
weight= datCO2Proc_model$driv_ec_pop
# lR_mreduced<-elimHelper(datCO2Proc_model, T,50,weight)
#
# #m<- lm( emiFac ~ ., weights = weight , data = lR_mreduced )
#
# #summary(m)
# subs <- leaps::regsubsets(emiFac ~ ., lR_mreduced, weights=weight, nbest=3)
#
# subsSum<-summary(subs)
# compSubsets<- data.frame(
# Adj.R2 = subsSum$adjr2,
# driv = subsSum$which
# )
# #decide for a model size and look at the best model
# model<- compSubsets["X3",]
# model[which( model==TRUE)]
# Set up a model with the best resulting predictors.
m<- lm( emiFac ~ driv_ec_dS+driv_prod_brans+kli_waste_nr, weights = weight, datCO2Proc )
summary(m)
##
## Call:
## lm(formula = emiFac ~ driv_ec_dS + driv_prod_brans + kli_waste_nr,
## data = datCO2Proc, weights = weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -490943 -26254 -9341 5300 768309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.938e+03 4.527e+02 4.281 1.9e-05 ***
## driv_ec_dS 3.605e+04 5.997e+02 60.120 < 2e-16 ***
## driv_prod_brans 2.770e+03 3.336e+01 83.048 < 2e-16 ***
## kli_waste_nr -1.663e+06 2.921e+04 -56.923 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92530 on 4984 degrees of freedom
## Multiple R-squared: 0.6499, Adjusted R-squared: 0.6497
## F-statistic: 3084 on 3 and 4984 DF, p-value: < 2.2e-16
# Add the fitted values for the emission factors to the data.
datCO2Proc %>% mutate(fitted= m$fitted.values) -> datCO2Proc
# Plot predicted emissions against true emissions.
p<- ggplot(datCO2Proc ,
# predictions from the weighted emission factor model
aes(x= (fitted)*
kli_processing_nr
, y= GWP_100_CO2_Processing
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# NOTE(review): lR and lR_m recompute datCO2Proc/datCO2Proc_model and appear
# unused below -- possibly leftover from an earlier version.
data %>% filter(kli_processing_wm!=0) %>% mutate(emiFac= GWP_100_CO2_Processing/kli_processing_nr) -> lR
lR_m <- lR %>% select(emiFac, starts_with(c("driv_ec","driv_proc","driv_prod", "ksd", "kli", "kcr") ) )
# Old model for emission factors (coefficients hard-coded from an earlier fit).
p2<- ggplot(data , aes( x= (14614.71 + driv_proc_brans *1651.60 + driv_proc_sugr_beet *5740.25 )*
kli_processing_nr ,
y= GWP_100_CO2_Processing, label = Region, Year= Year ,
country = info_countrynames
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))
# Exploratory: plot CO2 processing emissions (x) against the GDP driver (y).
p<- ggplot(
datCO2Proc #%>% filter(Year=="2000" )
, aes(x= GWP_100_CO2_Processing
, y= driv_ec_gdp
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# Exploratory: plot the emission factor (x) against the affluence driver (y).
p<- ggplot(
datCO2Proc #%>% filter(Year=="2000" )
, aes(x= emiFac
, y= driv_ec_aff
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
Plot the distribution of emission factors and get information about possible outliers.
# Set up data: emission factor = CH4 processing emissions / kcr processing (wm).
data %>% filter(kcr_processing_wm!=0) %>% mutate(emiFac= GWP_100_CH4_Processing/kcr_processing_wm) -> datCH4Proc
#datCH4Proc %>% filter(!(emiFac>0 & emiFac<2000))
#datCH4Proc %<>% filter(emiFac>0 & emiFac<500)
### View negative or extreme (> 3000) emission factor values interactively.
datCH4Proc %>% filter(emiFac< 0|emiFac>3000 ) %>% select(emiFac, GWP_100_CH4_Processing, kcr_processing_wm, Region, Year) %>% DT::datatable(.)
hist(datCH4Proc$emiFac,1000)
Since there are clearly outliers with extremely negative emission factors, I decided to filter those out first.
# Keep only plausible emission factors: non-negative and below 3000.
datCH4Proc <- datCH4Proc %>% filter(emiFac >= 0, emiFac < 3000)
Build the model
# Select the possible drivers for the model-building data set.
datCH4Proc_model <- datCH4Proc %>% select(emiFac, starts_with(c("driv_ec","driv_proc","driv_prod", "ksd", "kli", "kcr") ) )
# Best predictors were found by successive elimination with these commands:
#datCH4Proc_model_red <- elimHelper(datCH4Proc_model, F, 20)
#datCH4Proc_model_red <- elimHelper(datCH4Proc_model_red, T, 20)
# ...
#m<- lm(emiFac ~ ., datCH4Proc_model_red)
#k<-olsrr::ols_step_all_possible(m)
# Final (unweighted) model: affluence as the sole predictor.
model<- lm(emiFac ~ driv_ec_aff , datCH4Proc)
# Store the fitted emission factors and the entailed emission predictions.
datCH4Proc %>% mutate(fitted= model$fitted.values) %>% mutate(predicted= fitted*kcr_processing_wm ) ->datCH4Proc
summary(model)
##
## Call:
## lm(formula = emiFac ~ driv_ec_aff, data = datCH4Proc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -365.20 -26.91 -8.51 -1.55 2199.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.734e+00 2.034e+00 1.836 0.0664 .
## driv_ec_aff 2.959e-03 9.645e-05 30.676 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 109.3 on 4970 degrees of freedom
## Multiple R-squared: 0.1592, Adjusted R-squared: 0.159
## F-statistic: 941 on 1 and 4970 DF, p-value: < 2.2e-16
# Plot predicted emissions against true emissions.
# The "predicted" column (fitted emission factor * kcr_processing_wm) computed
# above is used directly; the previous version hard-coded coefficients
# (4.2339802, 0.0031080) that no longer match the fitted model
# (3.734, 2.959e-03 in the summary above).
p <- ggplot(
  datCH4Proc,
  aes(x = predicted,
      y = GWP_100_CH4_Processing,
      label = Region, Year = Year, country = info_countrynames
      )) +
  geom_point() +
  geom_smooth(method = 'lm', formula = y ~ x, aes(group = 1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# Weighted variant: population-weighted regression of the emission factor.
# Best predictors were found by successive elimination of variables with these commands:
weight= datCH4Proc_model$driv_ec_pop
# lR_mreduced<-elimHelper(datCH4Proc_model, T,50,weight)
#m<- lm( emiFac ~ ., weights = weight , data = lR_mreduced )
#summary(m)
#subs <- leaps::regsubsets(emiFac ~ ., lR_mreduced, weights=weight, nbest=3)
#subsSum<-summary(subs)
#compSubsets<- data.frame(
# Adj.R2 = subsSum$adjr2,
# driv = subsSum$which
# )
#decide for a model size and look at the best model
#model<- compSubsets["X3",]
#model[which( model==TRUE)]
# Set up a model with the best resulting predictors.
m<- lm( emiFac ~ driv_ec_exp+driv_ec_aff+driv_ec_urb, weights = weight, datCH4Proc )
summary(m)
##
## Call:
## lm(formula = emiFac ~ driv_ec_exp + driv_ec_aff + driv_ec_urb,
## data = datCH4Proc, weights = weight)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -548.23 -45.31 -19.66 3.21 2670.27
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.854e+00 1.459e+00 -1.271 0.204
## driv_ec_exp -1.508e-01 1.583e-02 -9.521 <2e-16 ***
## driv_ec_aff 7.059e-04 7.766e-05 9.090 <2e-16 ***
## driv_ec_urb 4.230e+01 3.697e+00 11.443 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 219 on 4968 degrees of freedom
## Multiple R-squared: 0.13, Adjusted R-squared: 0.1295
## F-statistic: 247.4 on 3 and 4968 DF, p-value: < 2.2e-16
# Add the fitted values for the emission factors to the data.
# NOTE(review): this overwrites "fitted" from the unweighted model above, while
# the "predicted" column still reflects the unweighted model -- confirm intended.
datCH4Proc %>% mutate(fitted= m$fitted.values) -> datCH4Proc
# Plot predicted emissions against true emissions.
p<- ggplot(datCH4Proc ,
# predictions from the weighted emission factor model
aes(x= (fitted)*
kcr_processing_wm
, y= GWP_100_CH4_Processing
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# Old model for emission factors (coefficients hard-coded from an earlier fit),
# for comparison with the plots above.
p2<- ggplot(
data #%>% filter(Region %in% c("BRA", "IDN", "IND") )
, aes( x= (37.4412095 +
driv_ec_pop * -0.0289276 +
driv_proc_others * 0.2461490)*kcr_processing_wm,
y= GWP_100_CH4_Processing, label = Region, Year= Year ,
country = info_countrynames
#color = -4.212e+02 * driv_dS + driv_aff * 5.742e-03 + driv_govI * 1.019e+03
))+
geom_point(
aes( )
)+
#geom_text(aes(label = Region))+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))
# Exploratory: plot CH4 processing emissions (x) against the GDP driver (y).
p<- ggplot(
datCH4Proc #%>% filter(Year=="2000" )
, aes(x= GWP_100_CH4_Processing
, y= driv_ec_gdp
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))
# Exploratory: plot the emission factor (x) against the affluence driver (y).
p<- ggplot(
datCH4Proc #%>% filter(Year=="2000" )
, aes(x= emiFac
, y= driv_ec_aff
, label = Region, Year= Year, country= info_countrynames #,
))+
geom_point()+
geom_smooth(method='lm', formula= y~x, aes(group=1))
plotly::ggplotly(p, tooltip = c("y", "Year", "country"))